Load packages
rm(list = ls())
library(tictoc) # to monitor time
library(raster) # read raster files
library(rgdal) # Use GDAL functions
library(exactextractr) # Zonal statistics from raster
library(sf) # Working with shapefile layers
library(tidyverse) # data wrangling
library(plyr) # data wrangling
library(dplyr) # data wrangling
library(ggpmisc) # To place geom_text within plot
# library(ggmap) # Plot leaflet
# library(ggplot2) # Visualizations
library(plotly) # Interactive visualization
print("Packages loaded successfully!")
## [1] "Packages loaded successfully!"
Functions specific to this RMD
#########################################
# Creating function for exponential fit
myexpfit <- function(response_var, pred_var, mydata){
print(paste0("Model = ", "log(",response_var, ")~", pred_var))
model.linear <- lm(paste0("log(",response_var, ")~", pred_var), data = mydata)
a1 <- exp(model.linear$coefficients[1])
b1 <- (model.linear$coefficients[2])
R2 <- summary(model.linear)$r.squared
myresults <- c(a1, b1, R2)
return(myresults)
}
#########################################
Load dataset
# dat <- read.csv("deep-learning-all-features.csv", header = TRUE)
# save(dat, file = "Quantix_deep_learning.RData")
load("Quantix_deep_learning.RData", verbose = TRUE)
## Loading objects:
## dat
Data preprocessing
Convert a set of columns from numeric to factors
# Convert a set of columns from numeric to factors
fcols <- c("Fieldname", "FlightDate", "Week", "Type", "N", "Strip", "Mgmt_zone",
"DEM_type_Flat_Area", "DEM_type_Lower_Slope", "DEM_type_Middle_Slope",
"DEM_type_Ridge", "DEM_type_Upper_Slope", "DEM_type_Valley")
dat[fcols] <- lapply(dat[fcols], factor)
str(dat)
## 'data.frame': 1270534 obs. of 46 variables:
## $ Fieldname : Factor w/ 8 levels "DMD_GH1","PSF_111",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ FlightDate : Factor w/ 22 levels "2019/06/07","2019/06/09",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ DOY : int 158 158 158 158 158 158 158 158 158 158 ...
## $ Week : Factor w/ 14 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Longitude : num 43 43 43 43 43 ...
## $ Latitude : num -76.6 -76.6 -76.6 -76.6 -76.6 ...
## $ Yield : num 182 188 191 194 197 ...
## $ Yield_Mg.ha : num 11.4 11.8 12 12.2 12.4 ...
## $ Type : Factor w/ 2 levels "grain","silage": 1 1 1 1 1 1 1 1 1 1 ...
## $ N : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Strip : Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ rgn_red : num 18 18 18 18 18 18 18 18 18 18 ...
## $ rgn_green : num 16 15 15 15 15 15 15 15 15 16 ...
## $ rgn_nir : num 36 36 35 35 35 35 35 35 35 35 ...
## $ rgb_red : num 115 115 116 117 118 118 119 118 118 119 ...
## $ rgb_green : num 89 89 90 90 91 91 92 91 91 92 ...
## $ rgb_blue : num 67 67 68 68 69 69 70 69 69 70 ...
## $ NDVI : num 0.333 0.333 0.321 0.321 0.321 ...
## $ GNDVI : num 0.385 0.412 0.4 0.4 0.4 ...
## $ EVI2 : num 0.597 0.616 0.59 0.59 0.59 ...
## $ SR : num 2 2 1.94 1.94 1.94 ...
## $ EXG : num -4 -4 -4 -5 -5 -5 -5 -5 -5 -5 ...
## $ TGI : num 3.28 3.28 3.28 2.89 2.89 2.89 2.89 2.89 2.89 2.89 ...
## $ Mgmt_zone : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 2 2 2 2 ...
## $ Climod_GDD : int 10 10 10 10 10 10 10 10 10 10 ...
## $ Climod_Temp_C : num 15.6 15.6 15.6 15.6 15.6 ...
## $ Climod_Precip_mm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ACIS_GDD : int 10 10 10 10 10 10 10 10 10 10 ...
## $ ACIS_Temp_C : num 15.6 15.6 15.6 15.6 15.6 ...
## $ ACIS_Precip_mm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NASA_PAR : num 159 159 159 159 159 ...
## $ NASA_Temp_C : num 17.3 17.3 17.3 17.3 17.3 ...
## $ NASA_SH : num 7.2 7.2 7.2 7.2 7.2 7.2 7.2 7.2 7.2 7.2 ...
## $ NASA_RH : num 59.7 59.7 59.7 59.7 59.7 ...
## $ NASA_Precip_mm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NASA_WS : num 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 ...
## $ NASA_SurfWet : num 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 ...
## $ NASA_RootWet : num 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 ...
## $ NASA_ProfWet : num 0.66 0.66 0.66 0.66 0.66 0.66 0.66 0.66 0.66 0.66 ...
## $ DEM : int 125 125 125 125 125 125 125 125 125 125 ...
## $ DEM_type_Flat_Area : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ DEM_type_Lower_Slope : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 2 2 ...
## $ DEM_type_Middle_Slope: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ DEM_type_Ridge : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 1 1 ...
## $ DEM_type_Upper_Slope : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ DEM_type_Valley : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
Check for missing values
# Check if any columns have missing values
sapply(dat, anyNA)
## Fieldname FlightDate DOY
## FALSE FALSE FALSE
## Week Longitude Latitude
## FALSE FALSE FALSE
## Yield Yield_Mg.ha Type
## FALSE FALSE FALSE
## N Strip rgn_red
## TRUE TRUE FALSE
## rgn_green rgn_nir rgb_red
## FALSE FALSE FALSE
## rgb_green rgb_blue NDVI
## FALSE FALSE FALSE
## GNDVI EVI2 SR
## FALSE FALSE FALSE
## EXG TGI Mgmt_zone
## FALSE FALSE TRUE
## Climod_GDD Climod_Temp_C Climod_Precip_mm
## FALSE FALSE FALSE
## ACIS_GDD ACIS_Temp_C ACIS_Precip_mm
## FALSE FALSE FALSE
## NASA_PAR NASA_Temp_C NASA_SH
## FALSE FALSE FALSE
## NASA_RH NASA_Precip_mm NASA_WS
## FALSE FALSE FALSE
## NASA_SurfWet NASA_RootWet NASA_ProfWet
## FALSE FALSE FALSE
## DEM DEM_type_Flat_Area DEM_type_Lower_Slope
## FALSE FALSE FALSE
## DEM_type_Middle_Slope DEM_type_Ridge DEM_type_Upper_Slope
## FALSE FALSE FALSE
## DEM_type_Valley
## FALSE
# Names of columns that has missing values
names(dat)[sapply(dat, anyNA)]
## [1] "N" "Strip" "Mgmt_zone"
Plot all yield data
# "DMD_GH1" "PSF_111" "PSF_12" "SLS_ABH" "SLS_NS" "SSF_121" "SSF_202" "SSF_66"
selfield <- "SSF_121"
sub_dat <- dat %>%
dplyr::filter(Fieldname == selfield) %>%
droplevels()
# Legend labeling - Splitting the yield range into equal classes
minyld <- round(min(sub_dat$Yield_Mg.ha),0) # minimum yield
maxyld <- round(max(sub_dat$Yield_Mg.ha),0) # maximum yield
Nclasses <- 4 # Number of yield classes to split from the whole yield range
mybreaks <- round(seq(minyld, maxyld, len = (Nclasses+1)),0)
YE_colors <- c("blue","cyan","lightgreen","yellow","orange", "red")
plt <- ggplot(sub_dat, aes(x = Latitude, y = Longitude)) +
geom_point(aes(color = Yield_Mg.ha), size = 0.7) +
theme_bw() +
# facet_grid(Field~Year, scales = "free") +
theme(aspect.ratio = 1.3) +
# ggtitle("Corn silage fields")+
# theme(legend.position = c(0.85, 0.27))+
scale_colour_gradientn(colours = YE_colors,
name = "Yield (Mg/ha)",
limits=c(minyld,maxyld),
breaks = mybreaks,
labels = round(mybreaks, 0)) +
guides(colour = guide_colourbar(barheight = unit(1.8, "in")))+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(size=14, face = "bold")) +
theme(legend.title = element_text(face = "bold", size=8, margin = margin(0,0,7,0)),
legend.text = element_text(vjust = 0.5),
strip.text = element_text(face = "bold", size = 10))
plt

# png_save_name <- paste(selfield, "_", "allYear", ".png", sep = "")
#
# wd <- 8.5
# ht <- 3
# ggsave(png_save_name, width = wd, height = ht, unit = "in")
Exploratory data analysis
# Violin plot
vplt <- ggplot(dat, aes(x = Fieldname, y = Yield_Mg.ha, fill = Fieldname)) +
geom_violin() +
theme_bw() +
facet_wrap(~Type, nrow = 2, scales = "free_y")
vplt

Weather vs NDVI
# "DMD_GH1" "PSF_111" "PSF_12" "SLS_ABH" "SLS_NS" "SSF_121" "SSF_202" "SSF_66"
# "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
# Best grain field - DMD_GH1
# Best sialge filed - SSF_121, PSF_12
selfield <- "DMD_GH1"
# seltype <- "silage"
sub_dat <- dat %>%
filter(Fieldname == selfield) %>%
droplevels()
mu <- ddply(sub_dat, "Week", summarise, ndvi.mean = mean(NDVI), weather.mean = mean(NASA_PAR))
plt1 <- ggplot(mu, aes(x = weather.mean, y = ndvi.mean)) +
# geom_boxplot() +
geom_point(size = 2.5) +
geom_line() +
theme_bw() +
xlab("NASA_PAR") +
ylab("NDVI")
plt1

Subset only N strip data
ndat <- dat %>%
filter(!is.na(N))
names(ndat)[sapply(ndat, anyNA)]
## [1] "Mgmt_zone"
Subset results for one field
# "DMD_GH1" "PSF_111" "PSF_12" "SLS_ABH" "SLS_NS" "SSF_121" "SSF_202" "SSF_66"
# "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
# Best grain field - DMD_GH1
# Best sialge filed - SSF_121, PSF_12
selfield <- "SSF_121"
# seltype <- "silage"
sub_dat <- ndat %>%
filter(Fieldname == selfield) %>%
droplevels()
# "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
lvl <- levels(sub_dat$Week)
sub_dat$Strip <- as.numeric(sub_dat$Strip)
r2df <- data.frame()
for (i in 1:length(lvl)){
print(i)
wsub_dat <- sub_dat %>%
filter(Strip <= 4) %>%
filter(Week == lvl[i]) %>%
droplevels()
mod <- myexpfit("Yield_Mg.ha", "NDVI", wsub_dat)
r2 <- c(lvl[i], mod[1], mod[2], mod[3])
r2df <- rbind(r2df, r2)
print(paste0(i, " = ", mod[3]))
}
## [1] 1
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "1 = 0.108986777872688"
## [1] 2
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "2 = 0.033914704266872"
## [1] 3
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "3 = 0.0897048512691024"
## [1] 4
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "4 = 0.0353783962271664"
## [1] 5
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "5 = 0.254804905319651"
## [1] 6
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "6 = 0.532648000702073"
## [1] 7
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "7 = 0.446660462128851"
## [1] 8
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "8 = 0.362433519079672"
## [1] 9
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "9 = 0.359349081579489"
## [1] 10
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "10 = 0.201208422613502"
## [1] 11
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "11 = 0.267718102984678"
## [1] 12
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "12 = 0.243352476662727"
## [1] 13
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "13 = 0.302828726265256"
## [1] 14
## [1] "Model = log(Yield_Mg.ha)~NDVI"
## [1] "14 = 0.0905284550943136"
colnames(r2df) <- c("Weeks", "a", "b", "R2")
points_sf <- st_as_sf(sub_dat, coords = c("Longitude", "Latitude"), crs = 32618, remove = FALSE)
plot(points_sf["Yield_Mg.ha"])

numeric_cols <- c("Weeks", "a", "b", "R2")
r2df[numeric_cols] <- lapply(r2df[numeric_cols], as.numeric)
r2plot <- ggplot(r2df, aes(x = Weeks, y = R2)) +
geom_point(size = 2) +
geom_line() +
theme_bw() +
scale_y_continuous(breaks = seq(0,1.0, 0.1)) +
scale_x_continuous(breaks = seq(1,15, 1))
# r2plot
ggplotly(r2plot)
Pick the best week
best_week <- which.max(r2df$R2)
sub_dat <- dat %>%
filter(Week == best_week & Fieldname == selfield)
a_coeff <- r2df[best_week, 2]
b_coeff <- r2df[best_week, 3]
sub_dat$Pred_Yield <- a_coeff*exp(b_coeff*sub_dat$NDVI)
sub_dat$error <- (sub_dat$Pred_Yield - sub_dat$Yield_Mg.ha)
mse <- mean(sub_dat$error^2)
rmse <- sqrt(mse)
cat("RMSE for", selfield, "on week", best_week, "is", rmse, "Mg/ha")
## RMSE for SSF_121 on week 6 is 7.154346 Mg/ha
Save actual and predicted yield into a CSV
sel_dat <- sub_dat %>%
select(c(Longitude, Latitude, Pred_Yield, Yield_Mg.ha))
colnames(sel_dat) <- c("Longitude", "Latitude", "Predicted_yield", "Actual_yield")
field_save_name <- paste0(selfield, "_Week", best_week, ".csv")
# write.csv(sel_dat, field_save_name, row.names = FALSE)